Personal Computer World 2009 February / PCWFEB09.iso / Software / Resources / Chat & Communication / Digsby build 37 / digsby_setup.exe / lib / lxml / html / diff.pyo
Python Compiled Bytecode | 2008-10-13 | 18KB | 637 lines
# Source Generated with Decompyle++
# File: in.pyo (Python 2.5)
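# lxml.html.diff: word-level HTML diffing and annotation.
# Public entry points (see __all__ below): html_annotate() and htmldiff().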
import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import cgi
import re
__all__ = [
'html_annotate',
'htmldiff']
try:
_unicode = unicode
except NameError:
_unicode = str
try:
basestring = __builtins__['basestring']
except (KeyError, NameError):
basestring = str
def default_markup(text, version):
return '<span title="%s">%s</span>' % (cgi.escape(_unicode(version), 1), text)
def html_annotate(doclist, markup = default_markup):
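    ''' doclist is a list of (html_fragment, version) pairs, ordered from oldest
    to newest.  Each word in the returned HTML is passed through the markup
    function together with the version in which it first appeared; the default
    markup wraps it in <span title="version">...</span>. '''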
tokenlist = [ tokenize_annotated(doc, version) for doc, version in doclist ]
cur_tokens = tokenlist[0]
for tokens in tokenlist[1:]:
html_annotate_merge_annotations(cur_tokens, tokens)
cur_tokens = tokens
cur_tokens = compress_tokens(cur_tokens)
result = markup_serialize_tokens(cur_tokens, markup)
return ''.join(result).strip()
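# Illustrative sketch (not part of the decompiled source): given two versions of a
# fragment,
#     html_annotate([('<p>Hello world</p>', 'v1'),
#                    ('<p>Hello brave world</p>', 'v2')])
# returns roughly
#     '<p><span title="v1">Hello</span> <span title="v2">brave</span> <span title="v1">world</span></p>'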
def tokenize_annotated(doc, annotation):
tokens = tokenize(doc, include_hrefs = False)
for tok in tokens:
tok.annotation = annotation
return tokens
def html_annotate_merge_annotations(tokens_old, tokens_new):
s = InsensitiveSequenceMatcher(a = tokens_old, b = tokens_new)
commands = s.get_opcodes()
for command, i1, i2, j1, j2 in commands:
if command == 'equal':
eq_old = tokens_old[i1:i2]
eq_new = tokens_new[j1:j2]
copy_annotations(eq_old, eq_new)
continue
def copy_annotations(src, dest):
for src_tok, dest_tok in zip(src, dest):
dest_tok.annotation = src_tok.annotation
def compress_tokens(tokens):
result = [
tokens[0]]
for tok in tokens[1:]:
if not (result[-1].post_tags) and not (tok.pre_tags) and result[-1].annotation == tok.annotation:
compress_merge_back(result, tok)
continue
result.append(tok)
return result
def compress_merge_back(tokens, tok):
last = tokens[-1]
if type(last) is not token or type(tok) is not token:
tokens.append(tok)
else:
text = _unicode(last)
if last.trailing_whitespace:
text += ' '
text += tok
merged = token(text, pre_tags = last.pre_tags, post_tags = tok.post_tags, trailing_whitespace = tok.trailing_whitespace)
merged.annotation = last.annotation
tokens[-1] = merged
def markup_serialize_tokens(tokens, markup_func):
for token in tokens:
for pre in token.pre_tags:
yield pre
html = token.html()
html = markup_func(html, token.annotation)
if token.trailing_whitespace:
html += ' '
yield html
for post in token.post_tags:
yield post
def htmldiff(old_html, new_html):
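    ''' Diff two HTML fragments and return HTML with <ins> and <del> tags added
    around the inserted and deleted text. '''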
old_html_tokens = tokenize(old_html)
new_html_tokens = tokenize(new_html)
result = htmldiff_tokens(old_html_tokens, new_html_tokens)
result = ''.join(result).strip()
return fixup_ins_del_tags(result)
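# Illustrative sketch (not part of the decompiled source):
#     htmldiff('<p>Hello world</p>', '<p>Hello brave world</p>')
# returns roughly '<p>Hello <ins>brave</ins> world</p>'; deleted text would be
# wrapped in <del> instead.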
def htmldiff_tokens(html1_tokens, html2_tokens):
s = InsensitiveSequenceMatcher(a = html1_tokens, b = html2_tokens)
commands = s.get_opcodes()
result = []
for command, i1, i2, j1, j2 in commands:
if command == 'equal':
result.extend(expand_tokens(html2_tokens[j1:j2], equal = True))
continue
if command == 'insert' or command == 'replace':
ins_tokens = expand_tokens(html2_tokens[j1:j2])
merge_insert(ins_tokens, result)
if command == 'delete' or command == 'replace':
del_tokens = expand_tokens(html1_tokens[i1:i2])
merge_delete(del_tokens, result)
continue
result = cleanup_delete(result)
return result
def expand_tokens(tokens, equal = False):
for token in tokens:
for pre in token.pre_tags:
yield pre
if not equal or not (token.hide_when_equal):
if token.trailing_whitespace:
yield token.html() + ' '
else:
yield token.html()
for post in token.post_tags:
yield post
def merge_insert(ins_chunks, doc):
(unbalanced_start, balanced, unbalanced_end) = split_unbalanced(ins_chunks)
doc.extend(unbalanced_start)
if doc and not doc[-1].endswith(' '):
doc[-1] += ' '
doc.append('<ins>')
if balanced and balanced[-1].endswith(' '):
balanced[-1] = balanced[-1][:-1]
doc.extend(balanced)
doc.append('</ins> ')
doc.extend(unbalanced_end)
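# Sentinel markers used by merge_delete() and cleanup_delete() to bracket deleted
# chunks in the intermediate chunk list before they become <del> tags.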
class DEL_START:
pass
class DEL_END:
pass
class NoDeletes(Exception):
pass
def merge_delete(del_chunks, doc):
doc.append(DEL_START)
doc.extend(del_chunks)
doc.append(DEL_END)
def cleanup_delete(chunks):
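    ''' Repeatedly locate DEL_START...DEL_END regions in the chunk list, rebalance
    the tags around them, and wrap the deleted text in <del>...</del>. '''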
    while True:
try:
(pre_delete, delete, post_delete) = split_delete(chunks)
except NoDeletes:
break
(unbalanced_start, balanced, unbalanced_end) = split_unbalanced(delete)
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
doc = pre_delete
if doc and not doc[-1].endswith(' '):
doc[-1] += ' '
doc.append('<del>')
if balanced and balanced[-1].endswith(' '):
balanced[-1] = balanced[-1][:-1]
doc.extend(balanced)
doc.append('</del> ')
doc.extend(post_delete)
chunks = doc
continue
return chunks
def split_unbalanced(chunks):
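    ''' Return (unbalanced_start, balanced, unbalanced_end): unbalanced_start is
    the tags opened but not closed within the chunks, unbalanced_end is the tags
    closed but never opened, and balanced is everything else. '''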
start = []
end = []
tag_stack = []
balanced = []
for chunk in chunks:
if not chunk.startswith('<'):
balanced.append(chunk)
continue
endtag = chunk[1] == '/'
name = chunk.split()[0].strip('<>/')
if name in empty_tags:
balanced.append(chunk)
continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                (name, pos, tag) = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([ tag for name, pos, tag in tag_stack ])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend([ chunk for name, pos, chunk in tag_stack ])
    balanced = [ chunk for chunk in balanced if chunk is not None ]
    return (start, balanced, end)
def split_delete(chunks):
try:
pos = chunks.index(DEL_START)
except ValueError:
raise NoDeletes
pos2 = chunks.index(DEL_END)
return (chunks[:pos], chunks[pos + 1:pos2], chunks[pos2 + 1:])
def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    while True:
        if not unbalanced_start:
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word; can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            break
def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    while True:
        if not unbalanced_end:
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # Reached a word or an opening tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            break
class token(_unicode):
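    ''' A single word from the document.  Subclasses the unicode/str type and
    carries pre_tags (markup before the word), post_tags (markup after it), an
    annotation, and a trailing_whitespace flag. '''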
hide_when_equal = False
def __new__(cls, text, pre_tags = None, post_tags = None, trailing_whitespace = False):
obj = _unicode.__new__(cls, text)
if pre_tags is not None:
obj.pre_tags = pre_tags
else:
obj.pre_tags = []
if post_tags is not None:
obj.post_tags = post_tags
else:
obj.post_tags = []
obj.trailing_whitespace = trailing_whitespace
return obj
def __repr__(self):
return 'token(%s, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, self.post_tags)
def html(self):
return _unicode(self)
class tag_token(token):
def __new__(cls, tag, data, html_repr, pre_tags = None, post_tags = None, trailing_whitespace = False):
obj = token.__new__(cls, '%s: %s' % (type, data), pre_tags = pre_tags, post_tags = post_tags, trailing_whitespace = trailing_whitespace)
obj.tag = tag
obj.data = data
obj.html_repr = html_repr
return obj
def __repr__(self):
return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (self.tag, self.data, self.html_repr, self.pre_tags, self.post_tags, self.trailing_whitespace)
def html(self):
return self.html_repr
class href_token(token):
hide_when_equal = True
def html(self):
return 'Link: %s' % self
def tokenize(html, include_hrefs = True):
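    ''' Parse the html fragment and return a list of token objects (words plus
    surrounding markup); href values get their own tokens when include_hrefs
    is true. '''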
body_el = parse_html(html, cleanup = True)
chunks = flatten_el(body_el, skip_tag = True, include_hrefs = include_hrefs)
return fixup_chunks(chunks)
def parse_html(html, cleanup = True):
if cleanup:
html = cleanup_html(html)
return fragment_fromstring(html, create_parent = True)
_body_re = re.compile('<body.*?>', re.I | re.S)
_end_body_re = re.compile('</body.*?>', re.I | re.S)
_ins_del_re = re.compile('</?(ins|del).*?>', re.I | re.S)
def cleanup_html(html):
match = _body_re.search(html)
if match:
html = html[match.end():]
match = _end_body_re.search(html)
if match:
html = html[:match.start()]
html = _ins_del_re.sub('', html)
return html
end_whitespace_re = re.compile('[ \\t\\n\\r]$')
def fixup_chunks(chunks):
tag_accum = []
cur_word = None
result = []
for chunk in chunks:
if isinstance(chunk, tuple):
if chunk[0] == 'img':
src = chunk[1]
tag = chunk[2]
if tag.endswith(' '):
tag = tag[:-1]
trailing_whitespace = True
else:
trailing_whitespace = False
cur_word = tag_token('img', src, html_repr = tag, pre_tags = tag_accum, trailing_whitespace = trailing_whitespace)
tag_accum = []
result.append(cur_word)
continue
if chunk[0] == 'href':
href = chunk[1]
cur_word = href_token(href, pre_tags = tag_accum, trailing_whitespace = True)
tag_accum = []
result.append(cur_word)
continue
continue
if is_word(chunk):
if chunk.endswith(' '):
chunk = chunk[:-1]
trailing_whitespace = True
else:
trailing_whitespace = False
cur_word = token(chunk, pre_tags = tag_accum, trailing_whitespace = trailing_whitespace)
tag_accum = []
result.append(cur_word)
continue
        if is_start_tag(chunk):
            tag_accum.append(chunk)
        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            elif cur_word is not None:
                cur_word.post_tags.append(chunk)
if not result:
return [
token('', pre_tags = tag_accum)]
else:
result[-1].post_tags.extend(tag_accum)
return result
empty_tags = ('param', 'img', 'area', 'br', 'basefont', 'input', 'base', 'meta', 'link', 'col')
block_level_tags = ('address', 'blockquote', 'center', 'dir', 'div', 'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'isindex', 'menu', 'noframes', 'noscript', 'ol', 'p', 'pre', 'table', 'ul')
block_level_container_tags = ('dd', 'dt', 'frameset', 'li', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr')
def flatten_el(el, include_hrefs, skip_tag = False):
if not skip_tag:
if el.tag == 'img':
yield ('img', el.attrib['src'], start_tag(el))
else:
yield start_tag(el)
if el.tag in empty_tags and not (el.text) and not len(el) and not (el.tail):
return None
start_words = split_words(el.text)
for word in start_words:
yield cgi.escape(word)
for child in el:
for item in flatten_el(child, include_hrefs = include_hrefs):
yield item
if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
yield ('href', el.attrib['href'])
if not skip_tag:
yield end_tag(el)
end_words = split_words(el.tail)
for word in end_words:
yield cgi.escape(word)
def split_words(text):
if not text or not text.strip():
return []
words = [ w + ' ' for w in text.strip().split() ]
return words
start_whitespace_re = re.compile('^[ \\t\\n\\r]')
def start_tag(el):
    return '<%s%s>' % (el.tag, ''.join([ ' %s="%s"' % (name, cgi.escape(value, True)) for name, value in el.attrib.items() ]))
def end_tag(el):
if el.tail and start_whitespace_re.search(el.tail):
extra = ' '
else:
extra = ''
return '</%s>%s' % (el.tag, extra)
def is_word(tok):
return not tok.startswith('<')
def is_end_tag(tok):
return tok.startswith('</')
def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')
def fixup_ins_del_tags(html):
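    ''' Move <ins> and <del> tags inside any block-level elements they contain,
    e.g. <ins><p>word</p></ins> becomes <p><ins>word</ins></p>. '''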
doc = parse_html(html, cleanup = False)
_fixup_ins_del_tags(doc)
html = serialize_html_fragment(doc, skip_outer = True)
return html
def serialize_html_fragment(el, skip_outer = False):
html = etree.tostring(el, method = 'html', encoding = _unicode)
if skip_outer:
html = html[html.find('>') + 1:]
html = html[:html.rfind('<')]
return html.strip()
else:
return html
def _fixup_ins_del_tags(doc):
for tag in [
'ins',
'del']:
for el in doc.xpath('descendant-or-self::%s' % tag):
if not _contains_block_level_tag(el):
continue
_move_el_inside_block(el, tag = tag)
el.drop_tag()
def _contains_block_level_tag(el):
if el.tag in block_level_tags or el.tag in block_level_container_tags:
return True
for child in el:
if _contains_block_level_tag(child):
return True
continue
return False
def _move_el_inside_block(el, tag):
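    ''' Helper for _fixup_ins_del_tags: pushes the ins/del markup (tag) down
    inside el's block-level children. '''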
for child in el:
if _contains_block_level_tag(child):
break
continue
    else:
        # No block-level tags in any child: wrap all of el's content in one new tag
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                # Wrap trailing text in its own tag and insert it after the child
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child) + 1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
if el.text:
text_tag = etree.Element(tag)
text_tag.text = el.text
el.text = None
el.insert(0, text_tag)
def _merge_element_contents(el):
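    ''' Remove an element but merge its contents into its place, e.g. removing
    the <i> from <p>Hi <i>there!</i></p> gives <p>Hi there!</p>. '''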
parent = el.getparent()
    text = el.text or ''
if el.tail:
if not len(el):
text += el.tail
elif el[-1].tail:
el[-1].tail += el.tail
else:
el[-1].tail = el.tail
index = parent.index(el)
if text:
if index == 0:
previous = None
else:
previous = parent[index - 1]
if previous is None:
if parent.text:
parent.text += text
else:
parent.text = text
elif previous.tail:
previous.tail += text
else:
previous.tail = text
parent[index:index + 1] = el.getchildren()
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
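    ''' Acts like SequenceMatcher, but drops very small equal blocks (at or below
    the threshold) so tiny matches amid large spans of changes are ignored. '''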
threshold = 2
def get_matching_blocks(self):
size = min(len(self.b), len(self.b))
threshold = min(self.threshold, size / 4)
actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [ item for item in actual if item[2] > threshold or not item[2] ]
if __name__ == '__main__':
from lxml.html import _diffcommand
_diffcommand.main()